Client Report - Delay Types

Unit 3 Stretch

Author

Ezekial Curran

Background

In 2003 the Bureau of Transportation Statistics (BTS) began collecting data on the causes of delayed flights. The categories they use are Air Carrier, National Aviation System, Weather, Late-Arriving Aircraft, and Security. You can visit the BTS website to read definitions of these categories.

Using a JSON file containing information on delays at 7 airports over 10 years, the goal is to clean the data and search for insights about flight delays. Also, show which airports have the worst delays then determine the best month to fly if one wants to avoid delays of any length.

Imports and Functions

Show the code
import polars as pl
import numpy as np
from lets_plot import *

LetsPlot.setup_html(isolated_frame=True)
Show the code
url = 'https://raw.githubusercontent.com/byuidatascience/data4missing/master/data-raw/flights_missing/flights_missing.json'
flights_json = pl.read_json('flights_missing.json')
# flights_json = pl.read_json(url)
Show the code
# Beginning of cleaning the data

# First job: Fix years
flights_json = flights_json.with_columns(
    pl.when((pl.col('year').is_null()) & (pl.col('year').shift(1) == pl.col('year').shift(-1)))
    .then(pl.col('year').shift(1))
    .when((pl.col('year').is_null()) & (pl.col('month') == "January") & (pl.col('month').shift(1) == "December"))
    .then(pl.col('year').shift(-1))
    .otherwise(pl.col('year'))
    .alias('year')
)

# Second job: Fix months
# Fix spelling of February
flights_json = flights_json.with_columns(
    pl.col('month').replace({'Febuary': 'February'}).alias('month')
)

# Fix missing months
flights_json = flights_json.with_columns(
    pl.when((pl.col('month') == 'n/a') & (pl.col('month').shift(1) == pl.col('month').shift(-1)) & (pl.col('month').shift(1) != 'n/a'))
    .then(pl.col('month').shift(1))
    .when((pl.col('month') == 'n/a') & (pl.col('airport_code') < pl.col('airport_code').shift(1)) & (pl.col('month').shift(-1) != 'n/a'))
    .then(pl.col('month').shift(-1))
    .when((pl.col('month') == 'n/a') & (pl.col('airport_code') > pl.col('airport_code').shift(1)) & (pl.col('month').shift(1) != 'n/a'))
    .then(pl.col('month').shift(1))
    .when((pl.col('month') == 'n/a') & (pl.col('airport_code') < pl.col('airport_code').shift(-1)) & (pl.col('month').shift(-1) != 'n/a'))
    .then(pl.col('month').shift(-1))
    .otherwise(pl.col('month'))
    .alias('month')
)

# Second job: Fix airport names
airports = flights_json.select(['airport_code', 'airport_name']).filter((pl.col('airport_name') != "")).unique().sort(by='airport_code', descending=False)
airport_dic = dict(zip(airports['airport_code'], airports['airport_name']))
flights_json = flights_json.with_columns(airport_name=pl.col('airport_code').replace(airport_dic))

# Third job: Fix missing values in time delays
flights_json = flights_json.with_columns(
    pl.col('minutes_delayed_nas').fill_null(-999.0),
    pl.col('minutes_delayed_carrier').fill_null(-999.0)
)

flights_json = flights_json.with_columns(
    pl.when((pl.col('minutes_delayed_carrier') < 0) & (pl.col('minutes_delayed_nas') > 0))
    .then(pl.col('minutes_delayed_total') - pl.col('minutes_delayed_weather') - pl.col('minutes_delayed_security') - pl.col('minutes_delayed_late_aircraft') - pl.col('minutes_delayed_nas'))
    .when((pl.col('minutes_delayed_carrier') < 0) & (pl.col('minutes_delayed_nas') < 0))
    .then((pl.col('minutes_delayed_total') - pl.col('minutes_delayed_weather') - pl.col('minutes_delayed_security') - pl.col('minutes_delayed_late_aircraft')) / 2)
    .otherwise(pl.col('minutes_delayed_carrier'))
    .alias('minutes_delayed_carrier')
)

flights_json = flights_json.with_columns(
    pl.when(pl.col('minutes_delayed_nas') < 0)
    .then(pl.col('minutes_delayed_total') - pl.col('minutes_delayed_weather') - pl.col('minutes_delayed_security') - pl.col('minutes_delayed_late_aircraft') - pl.col('minutes_delayed_carrier'))
    .otherwise(pl.col('minutes_delayed_nas'))
    .alias('minutes_delayed_nas')
)

# Fourth job: Fix missing delay counts
flights_json = flights_json.with_columns(
    num_of_delays_carrier=pl.col('num_of_delays_carrier').replace('1500+', '-999')
)

flights_json = flights_json.with_columns(
    pl.col('num_of_delays_carrier').cast(pl.Int64).alias('num_of_delays_carrier')
)

flights_json = flights_json.with_columns(
    pl.when((pl.col('num_of_delays_late_aircraft') < 0) & (pl.col('num_of_delays_carrier') > 0))
    .then(pl.col('num_of_delays_total') - pl.col('num_of_delays_weather') - pl.col('num_of_delays_security') - pl.col('num_of_delays_nas') - pl.col('num_of_delays_carrier'))
    .when((pl.col('num_of_delays_late_aircraft') < 0) & (pl.col('num_of_delays_carrier') < 0))
    .then(((pl.col('num_of_delays_total') - pl.col('num_of_delays_weather') - pl.col('num_of_delays_security') - pl.col('num_of_delays_nas')) / 2).cast(pl.Int64))
    .otherwise(pl.col('num_of_delays_late_aircraft'))
    .alias('num_of_delays_late_aircraft')
)

flights_json = flights_json.with_columns(
    pl.when((pl.col('num_of_delays_carrier') < 0) & (pl.col('num_of_delays_late_aircraft') > 0))
    .then(pl.col('num_of_delays_total') - pl.col('num_of_delays_weather') - pl.col('num_of_delays_security') - pl.col('num_of_delays_nas') - pl.col('num_of_delays_late_aircraft'))
    .otherwise(pl.col('num_of_delays_carrier'))
    .alias('num_of_delays_carrier')
)

Question 1

Which delay is the worst delay? Compare weather delays with carrier and security delays. Compare the proportion of delay for each of the three categories in a Chart and a Table.

Weather delays make up 73% of all delays, whereas security delays make up less than 1%. Carrier delays result in a little over a quarter of the delays people encounter while flying.

Show the code
# delays / flights
flights_json = flights_json.with_columns(
    pl.sum_horizontal(['num_of_delays_late_aircraft', 'num_of_delays_nas', 'num_of_delays_weather']).alias('total_weather_delays'),
    (pl.col('num_of_delays_total') / pl.col('num_of_flights_total')).alias('delay_ratio')
)

flights_json = flights_json.with_columns(
    (pl.col('total_weather_delays') / pl.col('num_of_delays_total')).alias('weather_delay_ratio'),
    (pl.col('num_of_delays_security') / pl.col('num_of_delays_total')).alias('security_delay_ratio'),
    (pl.col('num_of_delays_carrier') / pl.col('num_of_delays_total')).alias('carrier_delay_ratio')
)
Show the code
delays = flights_json.select(
    [
    pl.col('weather_delay_ratio').mean(),
    pl.col('security_delay_ratio').mean(),
    pl.col('carrier_delay_ratio').mean()
    ]
)

# Need to convert the results DataFrame into long format for ggplot, this is identicle to the .melt() function, however melt is depricated
delays_lf = delays.unpivot(index=[], variable_name='columns', value_name='values')

delays_lf = delays_lf.sort('values', descending=True)

display(delays_lf.head(7))
shape: (3, 2)
columns values
str f64
"weather_delay_ratio" 0.734484
"carrier_delay_ratio" 0.263594
"security_delay_ratio" 0.001955
Show the code
delay_bar = (
    ggplot(data=delays_lf)
        + geom_bar(mapping = aes(x = 'columns', y = 'values', fill='columns', color='columns'), stat='identity', labels=layer_labels().line('@values').size(18))
        + guides(color="none")
        + labs(
          title="Proportion of total delays by type.",
          subtitle="The value represents the proprotion the delay is of total delays.",
          x="Delay",
          y="Delay Proportion",
          fill='Delay Types'
        )
        + theme(
            panel_background=element_rect(fill='gray'),
            plot_background=element_rect(fill='gray'),
            panel_grid_major=element_rect(fill='gray'),
            legend_background=element_rect(fill='gray'),
            axis_text=element_text(color='white'),
            axis_title=element_text(color='white'),
            plot_title=element_text(color='white'),
            plot_subtitle=element_text(color='white'),
            legend_text=element_text(color='white'),
            legend_title=element_text(color='white'),
            label_text=element_text(color='white')
        )
        + ggsize(1600, 900)
)

display(delay_bar)

Question 2

Create another chart that shows the proportion of delays for each reason (Weather, Carrier, and Security) across all 7 airports.

For all airports the biggest contributor to delays is the weather. Some airports are more impacted by weather delays than by carrier compared to other airports due to their location.

Show the code
delay_airport = flights_json.group_by('airport_code').agg(
    [
    pl.col('weather_delay_ratio').mean(),
    pl.col('security_delay_ratio').mean(),
    pl.col('carrier_delay_ratio').mean()
    ]
)

delay_airport_lf = delay_airport.unpivot(index='airport_code', variable_name='delay', value_name='value')

display(delay_airport.head(7))
shape: (7, 4)
airport_code weather_delay_ratio security_delay_ratio carrier_delay_ratio
str f64 f64 f64
"ATL" 0.767371 0.000938 0.231695
"SLC" 0.679115 0.003645 0.317464
"SAN" 0.666938 0.002772 0.330298
"ORD" 0.809055 0.001037 0.189886
"DEN" 0.729347 0.002102 0.268546
"SFO" 0.787536 0.001699 0.210787
"IAD" 0.702023 0.001491 0.296478
Show the code
delay_air_bar = (
    ggplot(data=delay_airport_lf)
        + geom_bar(mapping = aes(x = 'airport_code', y = 'value', fill='delay', color='delay'), stat='identity', position='dodge')
        + guides(color="none")
        + labs(
          title="Proportion of total delays by type.",
          subtitle="The value represents the proprotion the delay is of total delays.",
          x="Delay",
          y="Delay Proportion",
          fill='Delay Types'
        )
        + geom_text(aes(x = 'airport_code', y = 'value', label = 'value'), size=8, nudge_y=0.02, color='white')
        + theme(
            panel_background=element_rect(fill='gray'),
            plot_background=element_rect(fill='gray'),
            panel_grid_major=element_rect(fill='gray'),
            legend_background=element_rect(fill='gray'),
            axis_text=element_text(color='white'),
            axis_title=element_text(color='white'),
            plot_title=element_text(color='white'),
            plot_subtitle=element_text(color='white'),
            legend_text=element_text(color='white'),
            legend_title=element_text(color='white'),
            label_text=element_text(color='white')
        )
        + ggsize(1600, 960)
)

display(delay_air_bar)